library(rvest)
## Warning: package 'rvest' was built under R version 4.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
## Warning: package 'stringr' was built under R version 4.3.3
# Scrape the Wikipedia list of current MLB stadiums
url <- "https://en.wikipedia.org/wiki/List_of_current_Major_League_Baseball_stadiums"
# Download and parse the page
webpage <- read_html(url)
# Collect every 'wikitable' node; the first one is assumed to be the stadium
# list (adjust the index if that assumption stops holding)
wikitables <- html_nodes(webpage, 'table.wikitable')
mlb_stadiums_table <- html_table(wikitables[[1]], fill = TRUE)
# Fetch the standings table for one season from mlb.com.
#   year: season year; it is pasted onto the standings URL.
# Returns the first table matched by the class-based XPath, as parsed by
# html_table(). Stops with an explicit error when no table matches.
get_team_records <- function(year) {
  url <- paste0("https://www.mlb.com/standings/mlb/", year)
  # Read HTML from MLB page
  standings_page <- read_html(url)
  # Select nodes whose class attribute contains the token "fZSGfm"
  # NOTE(review): this looks like a generated class name that may change when
  # the site is redeployed - confirm it still matches the standings table
  standings_table <- standings_page %>%
    html_nodes(xpath = '//*[contains(concat(" ", @class, " "), " fZSGfm ")]') %>%
    html_table()
  # Fail with a readable message instead of "subscript out of bounds"
  if (length(standings_table) == 0) {
    stop("No standings table found at ", url, call. = FALSE)
  }
  # extract the first table
  standings_table[[1]]
}
# Clean the 'Team' column of 'mlb_stadiums_table': the cleanup assumes a proper
# team name ends in "s" or "x", so any trailing run of other characters
# (footnote markers and the like) is removed.
# One vectorized replacement does the work of the former two passes (a regex
# pass plus a row-by-row trimming loop) and no longer leaves the table in a
# rowwise() state for the code that follows.
mlb_stadiums_table <- mlb_stadiums_table %>%
  mutate(Team = str_replace(Team, "[^sx]+$", ""))
# Convert 'Opened' to numeric; cells that are not plain numbers become NA
# (R reports this as "NAs introduced by coercion")
opened_years <- as.numeric(as.character(mlb_stadiums_table$Opened))
# Hand-entered 'Opened' years for specific teams
opened_overrides <- c(
  "Oakland Athletics" = 1997,
  "Tampa Bay Rays" = 1990,
  "Chicago Cubs" = 1914
)
for (team_name in names(opened_overrides)) {
  opened_years <- replace(
    opened_years,
    mlb_stadiums_table$Team == team_name,
    opened_overrides[[team_name]]
  )
}
mlb_stadiums_table$Opened <- opened_years
# Rewrite selected stadium names in the 'Name' column. Each entry is a
# (pattern, replacement) pair applied in order with gsub(); the last entry
# removes a trailing double-dagger marker.
name_fixes <- list(
  c("Oakland–Alameda County Coliseum", "Oakland Coliseum"),
  c("Tropicana Field†", "Tropicana Field"),
  c("LoanDepot Park", "loanDepot Park"),
  c("‡$", "")
)
stadium_names <- mlb_stadiums_table$Name
for (name_fix in name_fixes) {
  stadium_names <- gsub(name_fix[1], name_fix[2], stadium_names)
}
mlb_stadiums_table$Name <- stadium_names
# Rename the center-field distance column so the header carries the unit
is_center_field <- names(mlb_stadiums_table) == "Distance to center field"
names(mlb_stadiums_table)[is_center_field] <- "Distance to center field (ft)"
# Keep only the text before the first space in each cell
center_field <- mlb_stadiums_table$`Distance to center field (ft)`
mlb_stadiums_table$`Distance to center field (ft)` <- sub(" .*", "", center_field)
library(rvest)
library(dplyr)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.3
library(purrr)
library(stringr)
library(rvest)
library(dplyr)
library(purrr)
# Initialize empty df (an integer 'Year' column and a list column 'Standings').
# tibble() replaces data_frame(), which was deprecated in tibble 1.1.0.
all_standings <- tibble(Year = integer(), Standings = list())
# Fetch the standings table for a given year from mlb.com.
#   year: season year; it is pasted onto the standings URL.
# Returns the first table matched by the class-based XPath, or NULL when the
# page has no matching table.
get_yearly_standings <- function(year) {
  page <- read_html(paste0("https://www.mlb.com/standings/mlb/", year))
  matched_nodes <- html_nodes(
    page,
    xpath = '//*[contains(concat(" ", @class, " "), " fZSGfm ")]'
  )
  tables <- html_table(matched_nodes, fill = TRUE)
  # Guard clause: nothing matched, so there is no table to hand back
  if (length(tables) == 0) {
    return(NULL)
  }
  tables[[1]]
}
years <- 1974:2023
# Scrape one season at a time and stack the results into a tibble with one row
# per year: 'Year' plus a list column 'Standings' holding that season's table.
# A season with no table, or whose request fails, keeps its row with an NA
# placeholder so every year stays represented.
all_standings <- map_df(years, function(y) {
  tryCatch({
    # respect the server
    Sys.sleep(2)
    # Fetch
    standings <- get_yearly_standings(y)
    tibble(Year = y, Standings = list(if (is.null(standings)) NA else standings))
  }, error = function(e) {
    # Report the underlying condition so a failed season can be diagnosed
    message("Error retrieving data for year ", y, ": ", conditionMessage(e))
    tibble(Year = y, Standings = list(NA))
  })
})
# Walk every column of 'all_standings' and every element inside it; wherever an
# element has a TEAM column, strip one trailing "w", "y" or "z" from the team
# names.
# NOTE(review): presumably these are standings clinch markers appended to the
# team name - confirm against the scraped page.
# seq_along() is used instead of 1:length(): on an empty column 1:0 would
# iterate over c(1, 0) and fail on the first subscript.
for (i in seq_along(all_standings)) {
  nested_standings <- all_standings[[i]]
  if (!is.null(nested_standings)) {
    for (j in seq_along(nested_standings)) {
      nested_df <- nested_standings[[j]]
      if (!is.null(nested_df) && "TEAM" %in% colnames(nested_df)) {
        nested_df$TEAM <- gsub("[wyz]$", "", nested_df$TEAM)
        nested_standings[[j]] <- nested_df
      }
    }
    all_standings[[i]] <- nested_standings
  }
}
columns_to_keep <- c('TEAM', 'HOME', 'AWAY') # Add columns you want to keep
# Reduce every nested standings data frame to 'columns_to_keep'. Elements that
# are not data frames (for example the NA placeholders) are left untouched.
# A data frame lacking one of the columns still raises an error, as before.
# seq_along() is used instead of 1:length(): on an empty column 1:0 would
# iterate over c(1, 0) and fail on the first subscript.
for (i in seq_along(all_standings)) {
  nested_standings <- all_standings[[i]]
  if (!is.null(nested_standings)) {
    for (j in seq_along(nested_standings)) {
      nested_df <- nested_standings[[j]]
      if (!is.null(nested_df) && is.data.frame(nested_df)) {
        # drop = FALSE keeps a data frame even if only one column is kept
        nested_df <- nested_df[, columns_to_keep, drop = FALSE]
        nested_standings[[j]] <- nested_df
      }
    }
    all_standings[[i]] <- nested_standings
  }
}
library(dplyr)
library(tidyr)
library(stringr)
library(purrr)
library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:rvest':
##
## guess_encoding
# Drop the 2020 and 2021 seasons from the scraped standings
excluded_years <- c(2020, 2021)
all_standings <- filter(all_standings, !(Year %in% excluded_years))
# Parse a win-loss record such as "52-29".
#   record: character vector of records, split on the first "-".
# Returns a list with numeric elements 'Wins' and 'Losses', one value per
# element of 'record'.
parse_record <- function(record) {
  # simplify = TRUE gives a character matrix with one row per record, so the
  # two halves are taken by column. Linear indexing (parts[1], parts[2]) is
  # only correct for a single record: with several records parts[2] is the
  # second record's wins, not the first record's losses.
  parts <- str_split(record, "-", n = 2, simplify = TRUE)
  list(Wins = parse_number(parts[, 1]), Losses = parse_number(parts[, 2]))
}
# Build 'home_win_df': one row per team in 'mlb_stadiums_table' with the
# cumulative home record ("W-L") and home win percentage, summed over the
# seasons from the team's 'Opened' year (but no earlier than 1974) through 2023.
# Rows are collected in a preallocated list and bound once at the end instead
# of growing the tibble with rbind() on every iteration.
home_rows <- vector("list", length(mlb_stadiums_table$Team))
for (team_idx in seq_along(mlb_stadiums_table$Team)) {
  team <- mlb_stadiums_table$Team[[team_idx]]
  # na.rm = TRUE: a team whose 'Opened' value is NA falls back to 1974 instead
  # of yielding NA and crashing seq() below
  team_opened_year <- max(
    mlb_stadiums_table$Opened[mlb_stadiums_table$Team == team], 1974,
    na.rm = TRUE
  )
  # seq() would count downwards for an opening year after 2023, so guard it
  season_years <- if (team_opened_year <= 2023) {
    seq(team_opened_year, 2023)
  } else {
    integer(0)
  }
  team_wins <- 0
  team_losses <- 0
  # Loop through each season and accumulate the home wins and losses
  for (year in season_years) {
    # match() returns NA for a year that was filtered out or never scraped
    year_idx <- match(year, all_standings$Year)
    standings_df <- if (is.na(year_idx)) {
      NULL
    } else {
      all_standings$Standings[[year_idx]]
    }
    if (is.data.frame(standings_df) && "TEAM" %in% colnames(standings_df)) {
      # Case-insensitive exact match; unlike a regex built from the name, this
      # cannot be thrown off by characters such as "." in "St. Louis"
      team_record <- standings_df %>%
        filter(tolower(TEAM) == tolower(team)) %>%
        pull(HOME)
      if (length(team_record) == 1) {
        # Parse the home record and accumulate the wins and losses
        record_parts <- parse_record(team_record)
        team_wins <- team_wins + record_parts$Wins
        team_losses <- team_losses + record_parts$Losses
      } else {
        message(paste("Home record for team", team, "in year", year, "not found or is ambiguous."))
      }
    } else {
      message(paste("Standings dataframe for year", year, "is NULL or does not have the expected columns."))
    }
  }
  # Calculate win %; NA when no games were counted or a record failed to parse
  total_games <- team_wins + team_losses
  team_win_pct <- if (!is.na(total_games) && total_games > 0) {
    team_wins / total_games
  } else {
    NA_real_
  }
  # Format record and %
  home_rows[[team_idx]] <- tibble(
    Team = team,
    HomeRecord = paste(team_wins, team_losses, sep = "-"),
    HomeWinPct = round(team_win_pct, 3)
  )
}
# The empty typed tibble keeps the three columns even when there are no teams
home_win_df <- bind_rows(
  tibble(Team = character(), HomeRecord = character(), HomeWinPct = numeric()),
  home_rows
)
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Home record for team Los Angeles Angels in year 1974 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1975 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1976 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1977 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1978 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1979 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1980 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1981 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1982 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1983 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1984 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1985 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1986 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1987 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1988 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1989 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1990 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1991 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1992 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1993 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1994 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1995 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1996 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1997 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1998 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1999 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 2000 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 2001 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 2002 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 2003 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 2004 not found or is ambiguous.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Home record for team Los Angeles Dodgers in year 1981 not found or is ambiguous.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Home record for team Kansas City Royals in year 1981 not found or is ambiguous.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Home record for team Cleveland Guardians in year 1994 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 1995 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 1996 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 1997 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 1998 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 1999 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2000 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2001 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2002 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2003 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2004 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2005 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2006 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2007 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2008 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2009 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2010 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2011 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2012 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2013 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2014 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2015 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2016 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2017 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2018 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2019 not found or is ambiguous.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Home record for team Tampa Bay Rays in year 1990 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 1991 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 1992 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 1993 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 1994 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 1995 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 1996 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 1997 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 1998 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 1999 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 2000 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 2001 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 2002 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 2003 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 2004 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 2005 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 2006 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 2007 not found or is ambiguous.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
# Build 'away_win_df': one row per team in 'mlb_stadiums_table' with the
# cumulative away record ("W-L") and away win percentage, summed over the
# seasons from the team's 'Opened' year (but no earlier than 1974) through 2023.
# Rows are collected in a preallocated list and bound once at the end instead
# of growing the tibble with rbind() on every iteration.
away_rows <- vector("list", length(mlb_stadiums_table$Team))
for (team_idx in seq_along(mlb_stadiums_table$Team)) {
  team <- mlb_stadiums_table$Team[[team_idx]]
  # na.rm = TRUE: a team whose 'Opened' value is NA falls back to 1974 instead
  # of yielding NA and crashing seq() below
  team_opened_year <- max(
    mlb_stadiums_table$Opened[mlb_stadiums_table$Team == team], 1974,
    na.rm = TRUE
  )
  # seq() would count downwards for an opening year after 2023, so guard it
  season_years <- if (team_opened_year <= 2023) {
    seq(team_opened_year, 2023)
  } else {
    integer(0)
  }
  team_wins <- 0
  team_losses <- 0
  # Loop through each season and accumulate the away wins and losses
  for (year in season_years) {
    # match() returns NA for a year that was filtered out or never scraped
    year_idx <- match(year, all_standings$Year)
    standings_df <- if (is.na(year_idx)) {
      NULL
    } else {
      all_standings$Standings[[year_idx]]
    }
    if (is.data.frame(standings_df) && "TEAM" %in% colnames(standings_df)) {
      # Case-insensitive exact match; unlike a regex built from the name, this
      # cannot be thrown off by characters such as "." in "St. Louis"
      team_record <- standings_df %>%
        filter(tolower(TEAM) == tolower(team)) %>%
        pull(AWAY)
      if (length(team_record) == 1) {
        # Parse the away record and accumulate the wins and losses
        record_parts <- parse_record(team_record)
        team_wins <- team_wins + record_parts$Wins
        team_losses <- team_losses + record_parts$Losses
      } else {
        message(paste("Away record for team", team, "in year", year, "not found or is ambiguous."))
      }
    } else {
      message(paste("Standings dataframe for year", year, "is NULL or does not have the expected columns."))
    }
  }
  # Calculate win %; NA when no games were counted or a record failed to parse
  total_games <- team_wins + team_losses
  team_win_pct <- if (!is.na(total_games) && total_games > 0) {
    team_wins / total_games
  } else {
    NA_real_
  }
  # Formatting
  away_rows[[team_idx]] <- tibble(
    Team = team,
    AwayRecord = paste(team_wins, team_losses, sep = "-"),
    AwayWinPct = round(team_win_pct, 3)
  )
}
# The empty typed tibble keeps the three columns even when there are no teams
away_win_df <- bind_rows(
  tibble(Team = character(), AwayRecord = character(), AwayWinPct = numeric()),
  away_rows
)
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Away record for team Los Angeles Angels in year 1974 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1975 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1976 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1977 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1978 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1979 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1980 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1981 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1982 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1983 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1984 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1985 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1986 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1987 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1988 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1989 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1990 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1991 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1992 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1993 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1994 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1995 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1996 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1997 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1998 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1999 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 2000 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 2001 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 2002 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 2003 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 2004 not found or is ambiguous.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Away record for team Los Angeles Dodgers in year 1981 not found or is ambiguous.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Away record for team Kansas City Royals in year 1981 not found or is ambiguous.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Away record for team Cleveland Guardians in year 1994 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 1995 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 1996 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 1997 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 1998 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 1999 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2000 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2001 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2002 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2003 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2004 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2005 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2006 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2007 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2008 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2009 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2010 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2011 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2012 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2013 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2014 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2015 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2016 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2017 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2018 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2019 not found or is ambiguous.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Away record for team Tampa Bay Rays in year 1990 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 1991 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 1992 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 1993 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 1994 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 1995 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 1996 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 1997 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 1998 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 1999 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 2000 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 2001 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 2002 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 2003 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 2004 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 2005 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 2006 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 2007 not found or is ambiguous.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
# Copy the data and create a dataframe
data <- "franchise att avg5 att avg10 att avg15 att 2023 att 2022 att 2021 att 2020 att 2019 att 2018 att 2017 att 2016 att 2015 att 2014 att 2013 att 2012 att 2011 att 2010 att 2009
Los Angeles Dodgers 2,895,498 3,335,131 3,378,547 3,837,079 3,861,408 2,804,693 0 3,974,309 3,857,500 3,765,856 3,703,312 3,764,815 3,782,337 3,743,527 3,324,246 2,935,139 3,562,320 3,761,669
St. Louis Cardinals 2,428,913 2,950,212 3,058,161 3,241,091 3,320,551 2,102,530 0 3,480,393 3,403,587 3,447,937 3,444,490 3,520,889 3,540,649 3,369,769 3,262,109 3,093,954 3,301,218 3,343,252
New York Yankees 2,333,896 2,795,813 3,061,264 3,269,016 3,136,207 1,959,854 0 3,304,404 3,482,855 3,146,966 3,063,405 3,193,795 3,401,624 3,279,589 3,542,406 3,653,680 3,765,807 3,719,358
Atlanta Braves 2,255,357 2,271,443 2,329,331 3,191,505 3,129,931 2,300,247 0 2,655,100 2,555,781 2,505,252 2,020,914 2,001,392 2,354,305 2,548,679 2,420,171 2,372,940 2,510,119 2,373,631
San Diego Padres 2,169,475 2,213,942 2,175,148 3,271,554 2,987,470 2,191,950 0 2,396,399 2,147,000 2,138,491 2,351,426 2,459,752 2,195,373 2,166,691 2,123,721 2,143,018 2,131,774 1,922,603
Houston Astros 2,133,444 2,226,348 2,162,845 3,052,347 2,688,998 2,068,509 0 2,857,367 2,980,549 2,403,671 2,306,623 2,153,585 1,751,829 1,651,883 1,607,733 2,067,016 2,331,490 2,521,076
Toronto Blue Jays 1,647,087 2,232,732 2,152,222 3,021,904 2,653,830 809,557 0 1,750,144 2,325,281 3,203,886 3,392,299 2,794,891 2,375,525 2,536,562 2,099,663 1,818,103 1,625,555 1,876,129
Boston Red Sox 1,989,434 2,455,264 2,639,475 2,672,130 2,625,089 1,725,323 0 2,924,627 2,895,575 2,917,678 2,955,434 2,880,694 2,956,089 2,833,333 3,043,003 3,054,001 3,046,445 3,062,699
Chicago Cubs 2,093,146 2,569,072 2,697,731 2,775,149 2,616,780 1,978,934 0 3,094,865 3,181,089 3,199,562 3,232,420 2,959,812 2,652,113 2,642,682 2,882,756 3,017,966 3,062,973 3,168,859
Colorado Rockies 2,027,450 2,389,642 2,518,054 2,607,935 2,597,428 1,938,645 0 2,993,244 3,015,880 2,953,650 2,602,524 2,506,789 2,680,329 2,793,828 2,630,458 2,909,777 2,875,245 2,665,080
New York Mets 1,813,098 2,125,927 2,248,685 2,573,555 2,564,737 1,484,665 0 2,442,532 2,224,995 2,460,622 2,789,602 2,569,753 2,148,808 2,135,657 2,242,803 2,378,549 2,559,738 3,154,262
San Francisco Giants 1,874,017 2,475,745 2,716,499 2,500,153 2,482,686 1,679,484 0 2,707,760 3,156,185 3,303,652 3,365,256 2,193,581 3,368,697 3,326,796 3,377,371 3,387,303 3,037,443 2,861,113
Los Angeles Angels 1,925,816 2,479,372 2,702,168 2,640,575 2,457,461 1,512,033 0 3,019,012 3,020,216 3,019,583 3,016,142 3,012,765 3,095,935 3,019,505 3,061,770 3,166,321 3,250,814 3,240,386
Milwaukee Brewers 1,944,270 2,278,551 2,468,890 2,551,317 2,422,420 1,824,282 0 2,923,333 2,850,875 2,558,722 2,314,614 2,542,558 2,797,384 2,531,105 2,831,385 3,071,373 2,776,531 3,037,451
Seattle Mariners 1,597,107 2,012,861 1,985,923 2,690,418 2,287,267 1,215,985 0 1,791,863 2,299,489 2,135,445 2,267,928 3,375,882 2,064,334 1,761,546 1,721,920 1,896,321 2,085,168 2,195,284
Philadelphia Phillies 1,914,530 1,980,621 2,487,532 3,052,605 2,276,736 1,515,890 0 2,727,421 2,158,124 1,905,354 1,915,144 1,831,080 2,423,852 3,012,403 3,565,718 3,680,718 3,647,249 3,600,693
Washington Nationals 1,523,511 2,035,331 2,064,157 1,865,832 2,026,401 1,465,543 0 2,259,781 2,529,604 2,524,980 2,481,938 2,619,843 2,579,389 2,652,422 2,370,794 1,940,478 1,828,066 1,817,280
Texas Rangers 1,757,539 2,132,357 2,371,351 2,533,062 2,011,381 2,110,258 0 2,132,994 2,107,107 2,507,760 2,710,402 2,491,875 2,718,733 3,178,273 3,460,280 2,946,949 2,505,171 2,156,016
Chicago White Sox 1,378,416 1,528,329 1,699,821 1,669,628 1,976,344 1,596,385 0 1,649,725 1,608,817 1,629,470 1,746,293 1,755,810 1,650,821 1,768,413 1,965,955 2,001,117 2,194,378 2,284,164
Minnesota Twins 1,475,921 1,782,465 2,125,776 1,974,124 1,801,128 1,310,199 0 2,294,152 1,959,197 2,051,279 1,963,912 2,220,054 2,250,606 2,477,644 2,776,354 3,168,107 3,223,640 2,416,237
Arizona Diamondbacks 1,348,980 1,731,206 1,861,041 1,961,182 1,605,199 1,043,010 0 2,135,510 2,242,695 2,134,375 2,036,216 2,080,145 2,073,730 2,134,795 2,177,617 2,105,432 2,056,519 2,129,183
Detroit Tigers 1,153,616 1,808,376 2,124,377 1,612,876 1,551,149 1,102,623 0 1,501,430 1,856,970 2,321,599 2,493,859 2,726,048 2,917,209 3,083,397 3,028,033 2,642,045 2,461,237 2,567,185
Cincinnati Reds 1,349,558 1,700,432 1,860,527 2,038,310 1,395,770 1,505,024 0 1,808,685 1,629,356 1,836,917 1,894,085 2,419,506 2,476,664 2,534,369 2,347,251 2,213,498 2,060,551 1,747,919
Baltimore Orioles 1,081,240 1,595,622 1,720,778 1,936,798 1,368,367 793,229 0 1,307,807 1,564,192 2,028,424 2,172,344 2,320,590 2,464,473 2,357,561 2,102,240 1,755,461 1,733,018 1,907,163
Cleveland Guardians 1,196,589 1,437,575 1,503,400 1,834,068 1,295,869 1,114,368 0 1,738,642 1,926,701 2,048,138 1,591,667 1,388,905 1,437,393 1,572,926 1,603,596 1,840,835 1,391,644 1,766,242
Kansas City Royals 1,044,862 1,633,253 1,664,054 1,307,052 1,277,986 1,159,613 0 1,479,659 1,665,107 2,220,370 2,557,712 2,708,549 1,956,482 1,750,754 1,739,859 1,724,450 1,615,327 1,797,887
Pittsburgh Pirates 1,047,804 1,581,396 1,686,295 1,630,624 1,257,458 859,498 0 1,491,439 1,465,316 1,919,447 2,249,021 2,498,596 2,442,564 2,256,862 2,091,918 1,940,429 1,613,399 1,577,853
Tampa Bay Rays 901,647 1,089,712 1,282,417 1,440,301 1,128,127 761,072 0 1,178,735 1,154,973 1,253,619 1,286,163 1,247,668 1,446,464 1,510,300 1,559,681 1,529,188 1,864,999 1,874,962
Miami Marlins 704,845 1,118,426 1,299,973 1,162,819 907,487 642,617 0 811,302 811,104 1,651,997 1,712,417 1,752,235 1,732,283 1,586,322 2,219,444 1,520,562 1,524,894 1,464,109
Oakland Athletics 796,779 1,232,654 1,341,255 832,352 787,902 701,430 0 1,662,211 1,573,616 1,475,721 1,521,506 1,768,175 2,003,628 1,809,302 1,679,013 1,476,792 1,418,391 1,408,783
30 record(s)"
# Parse the pasted attendance table held in `data` into `attendance_df`:
# one row per franchise, per-home-game attendance for each retained season,
# plus the franchise's mean across those seasons.

# Remove leading/trailing whitespace, then split into lines
data <- trimws(data)
lines <- strsplit(data, "\n")[[1]]

# The first line carries the tab-separated column names
column_names <- strsplit(lines[1], "\t")[[1]]

# Drop the trailing "30 record(s)" footer by content rather than by row
# position, so it is never parsed as a (short, recycled) data row
body_lines <- lines[-1]
body_lines <- body_lines[!grepl("record\\(s\\)\\s*$", body_lines)]

# One character vector per franchise: name first, then the attendance fields
data_lines <- strsplit(body_lines, "\t")
attendance_df <- as.data.frame(do.call(rbind, data_lines), stringsAsFactors = FALSE)
colnames(attendance_df) <- column_names

# Discard the 2020/2021 seasons and the pre-computed multi-year averages
columns_to_delete <- c("att 2020", "att 2021", "att avg5", "att avg10", "att avg15")
attendance_df <- attendance_df[, !(names(attendance_df) %in% columns_to_delete)]

# Strip thousands separators, convert to numeric once, and divide by the
# number of home games to get attendance per home game
home_games_per_season <- 81
att_cols <- grep("^att", names(attendance_df))
attendance_df[att_cols] <- lapply(attendance_df[att_cols], function(x) {
  as.numeric(gsub(",", "", x)) / home_games_per_season
})

# Compute average per-game attendance for each team across the kept seasons
attendance_df$average_attendance <- rowMeans(attendance_df[att_cols], na.rm = TRUE)
hitta_data <- "Rk. Team Venue Year Park Factor wOBACon xwOBACon BACON xBACON HardHit R OBP H 1B 2B 3B HR BB SO PA
1 Estadio Alfredo Harp Helu 2023 150 177 114 149 101 114 225 128 137 90 173 173 318 106 120 646
2 Rockies Coors Field 2023 113 112 103 113 103 101 128 111 117 114 125 215 103 99 90 18,282
3 Red Sox Fenway Park 2023 108 107 100 109 101 102 117 108 111 110 128 114 92 99 93 17,912
4 London Stadium 2023 106 110 104 116 108 116 112 109 108 112 130 0 67 82 102 599
5 Royals Kauffman Stadium 2023 106 102 104 102 103 104 112 106 106 106 109 151 95 105 87 17,889
6 Rangers Globe Life Field 2023 106 107 102 104 100 105 112 104 105 101 103 71 133 104 97 19,412
7 Cardinals Busch Stadium 2023 103 101 105 102 104 103 106 104 106 110 103 78 98 100 90 17,939
8 Braves Truist Park 2023 103 106 106 105 105 107 106 101 103 102 95 99 119 97 106 18,058
9 Nationals Nationals Park 2023 103 99 101 101 101 100 106 104 108 113 100 87 98 92 82 17,543
10 Reds Great American Ball Park 2023 102 103 97 99 98 96 104 101 98 94 89 97 127 105 102 17,755
11 Cubs Wrigley Field 2023 101 101 100 101 101 98 102 101 101 102 94 114 105 101 101 17,628
12 Marlins loanDepot park 2023 101 101 101 103 102 98 102 102 106 109 110 89 87 90 98 17,413
13 Angels Angel Stadium 2023 100 102 101 100 100 98 100 99 98 98 82 103 115 103 107 17,966
14 Astros Minute Maid Park 2023 100 99 98 100 98 96 100 102 101 103 101 112 92 103 96 18,949
15 Twins Target Field 2023 100 105 102 102 99 101 100 99 98 92 110 121 107 101 112 18,168
16 White Sox Guaranteed Rate Field 2023 99 100 99 101 101 99 98 100 100 105 88 55 98 98 104 18,157
17 D-backs Chase Field 2023 99 98 95 100 98 100 98 99 102 103 106 209 79 98 98 19,262
18 Dodgers Dodger Stadium 2023 99 99 100 96 99 99 98 96 95 90 99 47 122 97 101 18,242
19 Tigers Comerica Park 2023 99 99 100 100 100 101 98 100 99 101 93 141 90 103 99 17,518
20 Phillies Citizens Bank Park 2023 99 102 101 100 100 101 98 98 97 93 100 106 114 97 106 19,048
21 Yankees Yankee Stadium 2023 99 99 103 96 101 103 98 97 95 91 91 45 123 102 102 17,256
22 Rays Tropicana Field 2023 98 101 98 99 98 99 96 96 96 91 105 121 103 93 111 17,740
23 Pirates PNC Park 2023 98 96 100 99 101 101 96 101 100 103 111 91 71 101 96 17,812
24 Brewers American Family Field 2023 98 99 100 98 100 96 96 98 95 94 91 66 107 109 109 17,825
25 Blue Jays Rogers Centre 2023 97 97 97 97 97 97 94 98 95 93 103 55 95 106 101 17,604
26 Mets Citi Field 2023 97 95 98 94 97 100 94 98 91 91 82 61 104 111 103 17,647
27 Orioles Oriole Park at Camden Yards 2023 96 95 101 96 101 103 92 96 99 105 88 112 91 93 97 17,765
28 Athletics Oakland Coliseum 2023 96 96 97 96 96 94 92 97 93 91 100 110 88 108 104 17,675
29 Padres Petco Park 2023 96 94 99 95 99 99 92 98 94 95 98 49 89 110 100 17,116
30 Guardians Progressive Field 2023 94 92 97 95 101 96 88 97 98 101 110 105 67 96 95 17,571
31 Giants Oracle Park 2023 94 94 97 96 98 102 88 94 99 105 88 92 89 83 98 17,067
32 Mariners T-Mobile Park 2023 93 96 97 96 98 96 86 94 92 93 91 69 94 95 113 17,578
33 Journey Bank Ballpark 2023 78 74 89 80 93 88 61 83 90 109 67 0 47 81 82 262"
# Convert the pasted park-factor table to a data frame.
# `text =` lets read.table() create and close its own connection; the previous
# textConnection() was never closed.
hitta_frame <- read.table(
  text = hitta_data,
  header = TRUE,
  sep = "\t",
  quote = "",
  dec = ".",
  fill = TRUE,
  check.names = FALSE
)

# Drop the rows for venues listed without a team in hitta_data
# (Estadio Alfredo Harp Helu, London Stadium, Journey Bank Ballpark)
rows_to_remove <- c(1, 4, 33)
hitta_frame <- hitta_frame[-rows_to_remove, ]

# Drop the rank, year and contact-quality columns
columns_to_delete <- c("Rk.", "Year", "wOBACon", "xwOBACon", "BACON", "xBACON")
hitta_frame <- hitta_frame[, !(names(hitta_frame) %in% columns_to_delete)]
# Per-venue 2023 extra-distance table, one row per team.
# The value column is named X2023 directly: that is the name data.frame()
# produces for a column written as `2023` under its default check.names = TRUE.
Xtra_distance <- data.frame(
  Rk = as.numeric(1:30),
  Team = c(
    "Rockies", "D-backs", "Rays", "Braves", "Brewers", "Phillies",
    "Astros", "Rangers", "Twins", "Dodgers", "Guardians", "Royals",
    "Pirates", "Padres", "Athletics", "Marlins", "Cardinals", "Reds",
    "Orioles", "Tigers", "White Sox", "Mariners", "Angels", "Blue Jays",
    "Nationals", "Red Sox", "Giants", "Yankees", "Cubs", "Mets"
  ),
  Venue = c(
    "Coors Field", "Chase Field", "Tropicana Field", "Truist Park",
    "American Family Field", "Citizens Bank Park", "Minute Maid Park",
    "Globe Life Field", "Target Field", "Dodger Stadium",
    "Progressive Field", "Kauffman Stadium", "PNC Park", "Petco Park",
    "Oakland Coliseum", "loanDepot Park", "Busch Stadium",
    "Great American Ball Park", "Oriole Park at Camden Yards",
    "Comerica Park", "Guaranteed Rate Field", "T-Mobile Park",
    "Angel Stadium", "Rogers Centre", "Nationals Park", "Fenway Park",
    "Oracle Park", "Yankee Stadium", "Wrigley Field", "Citi Field"
  ),
  X2023 = c(
    18.0, 9.5, 5.3, 6.6, 2.5, -0.6, -3.1, 5.2, 2.0, -3.4,
    -4.8, 5.5, -1.2, -3.9, -2.9, -1.2, -2.3, 1.8, -2.3, -1.0,
    -2.3, -1.8, -2.0, -4.1, -2.0, -5.7, -4.4, -5.7, -4.4, -2.0
  )
)
library(dplyr)
# Attach each venue's X2023 value from Xtra_distance to the stadium table,
# matching the stadium table's Name against Venue, then expose it under the
# name `2023_distance`
mlb_stadiums_table <- left_join(
  mlb_stadiums_table,
  Xtra_distance[, c("Venue", "X2023")],
  by = c("Name" = "Venue")
)
mlb_stadiums_table <- rename(mlb_stadiums_table, `2023_distance` = X2023)
# Home/Away split tables for four hitters. Each is pasted as a
# whitespace-separated text block whose first non-blank line is the header.
read_split_table <- function(split_text) {
  read.table(text = split_text, header = TRUE)
}

dataJose <- "
BREAKDOWN AB R H 2B 3B HR RBI BB HBP SO SB CS AVG OBP SLG OPS
Home 296 47 92 16 1 9 35 24 6 52 19 3 .311 .371 .463 .834
Away 294 65 112 23 3 15 46 34 3 32 13 3 .381 .449 .633 1.081
"
dataBregman <- "
BREAKDOWN AB R H 2B 3B HR RBI BB HBP SO SB CS AVG OBP SLG OPS
Home 266 42 74 13 2 9 30 27 1 44 10 2 .278 .343 .444 .787
Away 290 46 84 26 3 10 41 28 6 53 7 3 .290 .360 .503 .863
"
dataSpringer <- "
BREAKDOWN AB R H 2B 3B HR RBI BB HBP SO SB CS AVG OBP SLG OPS
Home 271 51 74 16 0 16 42 35 6 51 2 5 .273 .366 .509 .875
Away 277 61 81 13 0 18 43 29 5 60 3 2 .292 .367 .534 .902 "
# This table uses a different, wider set of columns than the three above
dataCorrea <- "
BREAKDOWN G AB PA H 1B 2B 3B HR R RBI BB IBB SO HBP SF SH GDP SB CS AVG
Home 51 186 212 62 38 13 0 11 31 40 23 4 31 1 2 0 6 1 0 .333
Away 58 236 269 71 45 12 1 13 51 44 30 1 61 1 2 0 6 1 1 .301"

JoseAltuve <- read_split_table(dataJose)
AlexBregman <- read_split_table(dataBregman)
GeorgeSpringer <- read_split_table(dataSpringer)
CarlosCorrea <- read_split_table(dataCorrea)
# MLB team earned run average table: one row per team, ranked 1-30, with the
# 2023 figure, the Home and Away figures, and the 2022 figure.
# The season columns are named X2023 / X2022 directly; those are the names
# data.frame() generates for `2023` / `2022` under its default check.names.
mlb_era <- data.frame(
  Rank = 1:30,
  Team = c(
    "San Diego Padres", "Seattle Mariners", "Milwaukee Brewers",
    "Toronto Blue Jays", "Minnesota Twins", "Tampa Bay Rays",
    "Philadelphia Phillies", "Baltimore Orioles", "New York Yankees",
    "Cleveland Guardians", "Houston Astros", "San Francisco Giants",
    "Chicago Cubs", "Los Angeles Dodgers", "Atlanta Braves",
    "Miami Marlins", "Texas Rangers", "Detroit Tigers",
    "New York Mets", "Arizona Diamondbacks", "Boston Red Sox",
    "Pittsburgh Pirates", "Los Angeles Angels", "St. Louis Cardinals",
    "Cincinnati Reds", "Chicago White Sox", "Washington Nationals",
    "Kansas City Royals", "Oakland Athletics", "Colorado Rockies"
  ),
  X2023 = c(
    3.73, 3.74, 3.75, 3.78, 3.85, 3.89, 3.90, 3.96, 3.97, 3.97,
    3.98, 4.02, 4.08, 4.10, 4.15, 4.23, 4.24, 4.24, 4.30, 4.39,
    4.52, 4.60, 4.64, 4.81, 4.83, 4.88, 5.02, 5.17, 5.48, 5.68
  ),
  Home = c(
    3.33, 3.42, 3.69, 3.68, 3.77, 3.83, 3.54, 3.91, 3.92, 3.68,
    4.36, 3.52, 3.94, 3.58, 4.17, 4.09, 4.65, 4.47, 3.89, 4.13,
    4.73, 4.64, 4.46, 4.87, 4.90, 4.71, 5.08, 4.96, 4.96, 6.12
  ),
  Away = c(
    4.14, 4.07, 3.82, 3.87, 3.94, 3.96, 4.28, 4.01, 4.02, 4.28,
    3.59, 4.55, 4.23, 4.66, 4.13, 4.38, 3.83, 4.00, 4.73, 4.65,
    4.29, 4.56, 4.83, 4.76, 4.75, 5.06, 4.96, 5.39, 6.10, 5.20
  ),
  X2022 = c(
    3.81, 3.59, 3.83, 3.93, 3.98, 3.38, 3.90, 3.97, 3.30, 3.44,
    2.85, 3.85, 4.00, 2.82, 3.51, 3.87, 4.22, 4.04, 3.62, 4.25,
    4.53, 4.66, 3.77, 3.80, 4.86, 3.92, 5.00, 4.70, 4.52, 5.07
  )
)
# Team payroll table: one row per team in rank order, with the current
# payroll figure and the two 2024 projection columns.
# The projection columns are named as data.frame() would rename
# `2024 Payroll Proj` / `2024 Luxury Tax Proj` under its default check.names.
payroll_data <- data.frame(
  Team = c(
    "New York Mets", "Los Angeles Dodgers", "New York Yankees",
    "Philadelphia Phillies", "Houston Astros", "Atlanta Braves",
    "Toronto Blue Jays", "Texas Rangers", "Chicago Cubs",
    "San Francisco Giants", "Boston Red Sox", "St. Louis Cardinals",
    "Los Angeles Angels", "Arizona Diamondbacks", "San Diego Padres",
    "Chicago White Sox", "Colorado Rockies", "Seattle Mariners",
    "Washington Nationals", "Minnesota Twins", "Kansas City Royals",
    "Detroit Tigers", "Milwaukee Brewers", "Cincinnati Reds",
    "Miami Marlins", "Cleveland Guardians", "Baltimore Orioles",
    "Tampa Bay Rays", "Pittsburgh Pirates", "Oakland Athletics"
  ),
  Rank = 1:30,
  Payroll = c(
    322, 314, 302, 246, 241, 230, 226, 225, 223, 208,
    181, 181, 175, 169, 167, 150, 147, 139, 131, 127,
    116, 109, 109, 104, 99, 98, 97, 97, 84, 61
  ),
  X2024.Payroll.Proj = c(
    333, 320, 312, 262, 256, 273, 249, 248, 234, 253,
    220, 215, 190, 217, 226, 129, 168, 161, 142, 157,
    162, 121, 149, 122, 121, 135, 123, 134, 120, 82
  ),
  X2024.Luxury.Tax.Proj = c(
    84.8, 90.8, 68.7, 87.6, 81.5, 86.6, 71.7, 88.4, 86, 82.9,
    82, 87.9, 81.3, 70.7, 81.7, 80.6, 78.5, 73.1, 62.2, 82.6,
    77, 79.4, 71.2, 72.3, 47, 38.8, 31.2, 54.2, 73.6, 57.7
  )
)
library(dplyr)
library(stringr)
# Give the payroll ranking a name that stays unambiguous once it sits in the
# stadium table
payroll_data <- rename(payroll_data, payroll_rank = Rank)

# Bring payroll_rank into the stadium table, matching rows on Team
mlb_stadiums_table <- left_join(
  mlb_stadiums_table,
  select(payroll_data, Team, payroll_rank),
  by = "Team"
)
# Park-factor rows kept as raw strings: a team name followed by five decimal
# values. The parsing step that follows labels the values Runs, HR, 1B, 2B
# and 3B, and its regex expects exactly one space between fields, so the
# spacing inside these strings must not be changed.
park_data <- c(
"Colorado Rockies 1.351 1.220 1.160 1.295 1.816",
"Boston Red Sox 1.170 1.049 1.119 1.242 1.669",
"Cincinnati Reds 1.133 1.406 1.016 1.003 0.666",
"Kansas City Royals 1.112 0.888 1.123 1.211 1.259",
"Texas Rangers 1.063 1.190 0.978 1.046 0.821",
"Atlanta Braves 1.038 1.009 1.010 0.964 0.974",
"Chicago White Sox 1.035 1.176 0.984 0.907 0.626",
"Pittsburgh Pirates 1.034 0.823 1.076 1.068 1.091",
"Los Angeles Angels 1.031 1.154 1.000 0.890 1.031",
"Chicago Cubs 1.017 1.030 1.050 0.920 1.206",
"Baltimore Orioles 1.013 1.087 1.081 0.877 0.842",
"Milwaukee Brewers 1.005 1.108 0.936 0.963 1.113",
"Miami Marlins 1.004 0.929 1.051 1.014 1.185",
"Philadelphia Phillies 0.998 1.090 0.952 0.973 1.143",
"Washington Nationals 0.998 1.035 0.971 1.031 0.877",
"Arizona Diamondbacks 0.994 0.765 1.095 1.134 1.303",
"Detroit Tigers 0.989 0.818 1.021 1.019 1.803",
"New York Yankees 0.988 1.139 0.933 0.911 0.583",
"Houston Astros 0.975 0.978 0.976 0.995 1.422",
"Minnesota Twins 0.970 0.956 0.974 1.115 0.843",
"Toronto Blue Jays 0.955 0.982 0.945 1.037 0.644",
"Los Angeles Dodgers 0.949 1.218 0.903 0.970 0.667",
"St. Louis Cardinals 0.941 0.892 1.028 0.936 0.749",
"San Francisco Giants 0.933 0.807 1.026 1.001 1.301",
"Cleveland Guardians 0.929 0.935 0.974 0.969 0.842",
"Oakland Athletics 0.908 0.798 0.967 0.996 1.049",
"Tampa Bay Rays 0.890 0.895 0.879 0.981 0.905",
"Seattle Mariners 0.885 0.956 0.907 0.853 0.535",
"New York Mets 0.873 0.900 0.933 0.828 0.636",
"San Diego Padres 0.860 0.931 0.953 0.891 0.593"
)
# Parse each park_data string into a team name (lazy first group) and five
# single-space-separated decimal values.
regex_pattern <- "(.*?) (\\d\\.\\d+) (\\d\\.\\d+) (\\d\\.\\d+) (\\d\\.\\d+) (\\d\\.\\d+)"
park_data_matches <- str_match(park_data, regex_pattern)

# str_match() returns NA for rows the pattern does not match; surface that
# instead of letting NA rows flow silently into the ranking and the join
unmatched_rows <- is.na(park_data_matches[, 1])
if (any(unmatched_rows)) {
  warning(
    "park_data rows not matching the expected format: ",
    paste(which(unmatched_rows), collapse = ", "),
    call. = FALSE
  )
}

# Column 1 of the match matrix is the full match; keep only the capture groups
# and name the first one Team straight away
Park_rank_df <- as.data.frame(park_data_matches[, -1], stringsAsFactors = FALSE)
names(Park_rank_df) <- c("Team", "Runs", "HR", "1B", "2B", "3B")

# Convert the factor columns to numeric once (across() supersedes mutate_at())
# and rank teams by Runs, highest first; ties share the lowest rank
Park_rank_df <- Park_rank_df %>%
  mutate(across(c(Runs, HR, `1B`, `2B`, `3B`), as.numeric)) %>%
  mutate(Runs_rank = min_rank(desc(Runs)))

# Join Runs_rank from Park_rank_df to mlb_stadiums_table
mlb_stadiums_table <- mlb_stadiums_table %>%
  left_join(Park_rank_df %>% select(Team, Runs_rank), by = "Team")
# Colour scheme: one hex colour per team, keyed by full team name
mlb_colors <- list(
"Arizona Diamondbacks" = "#A71930", # Sedona Red
"Atlanta Braves" = "#13274F", # Braves Blue
"Baltimore Orioles" = "#DF4601", # Orioles Orange
"Boston Red Sox" = "#BD3039", # Red Sox Red
"Chicago White Sox" = "#27251F", # White Sox Black
"Chicago Cubs" = "#0E3386", # Cubs Blue
"Cincinnati Reds" = "#C6011F", # Reds Red
"Cleveland Guardians" = "#E31937", # Guardians Red
"Colorado Rockies" = "#33006F", # Rockies Purple
"Detroit Tigers" = "#0C2C56", # Tigers Navy
"Houston Astros" = "#EB6E1F", # Astros Orange
"Kansas City Royals" = "#004687", # Royals Blue
"Los Angeles Angels" = "#BA0021", # Angels Red
"Los Angeles Dodgers" = "#005A9C", # Dodgers Blue
"Miami Marlins" = "#00A3E0", # Marlins Blue
"Milwaukee Brewers" = "#13294B", # Brewers Navy
"Minnesota Twins" = "#002B5C", # Twins Navy
"New York Yankees" = "#003087", # Yankees Navy
"New York Mets" = "#FF5910", # Mets Orange
"Oakland Athletics" = "#003831", # Athletics Green
"Philadelphia Phillies" = "#E81828", # Phillies Red
"Pittsburgh Pirates" = "#FFB712", # Pirates Gold
"San Diego Padres" = "#2F241D", # Padres Brown
"San Francisco Giants" = "#FD5A1E", # Giants Orange
"Seattle Mariners" = "#005C5C", # Mariners Turquoise
"St. Louis Cardinals" = "#C41E3A", # Cardinals Red
"Tampa Bay Rays" = "#8FBCE6", # Rays Light Blue
"Texas Rangers" = "#C0111F", # Rangers Red
"Toronto Blue Jays" = "#134A8E", # Blue Jays Navy
"Washington Nationals" = "#AB0003" # Nationals Red
)
# Lookup table: one row per team with its hex colour
team_colors_df <- data.frame(Team = names(mlb_colors), team_color = unlist(mlb_colors))

# Join the colour onto the stadium table by Team.
# Any team_color column left over from an earlier run is dropped first, so the
# join always yields exactly one column named team_color. This replaces the
# old positional rename of the last column, which did nothing on a first run
# and would relabel a suffixed duplicate on a re-run.
mlb_stadiums_table <- mlb_stadiums_table %>%
  select(-any_of("team_color")) %>%
  left_join(team_colors_df, by = "Team")
library(rvest)
library(magick)
## Warning: package 'magick' was built under R version 4.3.3
## Linking to ImageMagick 6.9.12.98
## Enabled features: cairo, freetype, fftw, ghostscript, heic, lcms, pango, raw, rsvg, webp
## Disabled features: fontconfig, x11
library(dplyr)
# Page holding the ballpark panoramas; fetch and parse it once
url <- "https://www.launchphotography.com/Ballpark_Panoramas.html"
webpage <- read_html(url)

# Caption text: nodes carrying both the "m-font-size-13" and "font-size-16"
# classes
stadium_names_nodes <- webpage %>%
  html_nodes(xpath = '//*[contains(concat( " ", @class, " " ), concat( " ", "m-font-size-13", " " )) and contains(concat( " ", @class, " " ), concat( " ", "font-size-16", " " ))]')
stadium_names <- stadium_names_nodes %>%
  html_text()

# Every <img> under the element with id "dm_content"; resolve each src
# against the page URL
image_nodes <- webpage %>%
  html_nodes(xpath = '//*[(@id = "dm_content")]//img')
image_urls <- image_nodes %>%
  html_attr("src") %>%
  url_absolute(url)
# Download every image into a list that stays aligned with image_urls:
# a failed download leaves a NULL in that position rather than being dropped.
# The log line names the URL that failed and uses conditionMessage() instead
# of pasting the condition object itself.
image_list <- lapply(image_urls, function(image_url) {
  tryCatch(
    image_read(image_url),
    error = function(e) {
      message("Error in downloading image: ", image_url, ": ", conditionMessage(e))
      NULL
    }
  )
})
## Error in downloading image: Error in curl::curl_download(url, tmp, handle = h): HTTP error 404.
## Error in downloading image: Error in curl::curl_download(url, tmp, handle = h): HTTP error 404.
##
## Error in downloading image: Error in curl::curl_download(url, tmp, handle = h): HTTP error 404.
##
## Error in downloading image: Error in curl::curl_download(url, tmp, handle = h): HTTP error 404.
##
## Error in downloading image: Error in curl::curl_download(url, tmp, handle = h): HTTP error 404.
# Pair captions with images by position, truncated to the shorter of the two.
# seq_len() gives an empty index when nothing was scraped; 1:min_length would
# give c(1, 0) and produce a bogus one-row table.
min_length <- min(length(stadium_names), length(image_list))
paired_rows <- seq_len(min_length)
stadium_images_df <- tibble(
  Stadium = stadium_names[paired_rows],
  Image = I(image_list[paired_rows])
)

# Print the metadata of every image that downloaded successfully
for (image_object in stadium_images_df$Image) {
  if (!is.null(image_object)) {
    print(image_object)
  }
}
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 474 sRGB FALSE 234605 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 614 sRGB FALSE 318129 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 540 sRGB FALSE 291537 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 499 sRGB FALSE 279376 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 523 sRGB FALSE 295955 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 499 sRGB FALSE 272255 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 409 sRGB FALSE 231265 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 541 sRGB FALSE 311994 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 610 sRGB FALSE 357814 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 584 sRGB FALSE 291814 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 396 sRGB FALSE 231548 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 503 sRGB FALSE 307399 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 591 sRGB FALSE 296912 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 581 sRGB FALSE 324363 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 580 sRGB FALSE 419377 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 681 sRGB FALSE 375248 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 565 sRGB FALSE 299645 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 484 sRGB FALSE 307413 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 507 sRGB FALSE 301651 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 497 sRGB FALSE 278870 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 634 sRGB FALSE 397795 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 613 sRGB FALSE 362645 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 596 sRGB FALSE 318296 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 499 sRGB FALSE 274633 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 619 sRGB FALSE 336081 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 615 sRGB FALSE 350995 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 586 sRGB FALSE 358121 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 406 sRGB FALSE 207396 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 533 sRGB FALSE 267994 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 488 sRGB FALSE 241896 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 530 sRGB FALSE 269468 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 436 sRGB FALSE 221814 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 533 sRGB FALSE 346027 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 572 sRGB FALSE 314691 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 489 sRGB FALSE 285644 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 479 sRGB FALSE 251386 72x72
# Discard rows 31 through 36 of the scraped caption/image table
rows_to_remove <- seq(31, 36, by = 1)
stadium_images_df <- stadium_images_df[-rows_to_remove, , drop = FALSE]
library(dplyr)
library(stringr)
library(magick)
# Repair the one caption whose opening parenthesis is written as ")".
# fixed = TRUE because the pattern is a literal string, not a regular
# expression (it contains ")" and "." characters).
stadium_images_df$Stadium <- gsub(
  "Busch Stadium )St. Louis Cardinals)",
  "Busch Stadium (St. Louis Cardinals)",
  stadium_images_df$Stadium,
  fixed = TRUE
)

# Steps 1 and 2: keep only the text inside the first pair of parentheses of
# each caption, expose it as Team, and keep just Team and Image
stadium_images_df <- stadium_images_df %>%
  mutate(Stadium = str_extract(Stadium, "(?<=\\().+?(?=\\))")) %>%
  rename(Team = Stadium) %>%
  select(Team, Image)

# Step 3: attach each team's panorama to the stadium table as `Image`.
# The previous left_join() followed by a row loop raised
# "Unknown or uninitialised column: `Image`", i.e. the join did not produce a
# column with that name, and the loop then built the column by recycling the
# first assigned image into every row. A single positional lookup sets the
# column directly; teams without a panorama get NULL.
# NOTE(review): presumably mlb_stadiums_table already carries a scraped
# `Image` column, which would make the join emit Image.x / Image.y. Those
# suffixed columns are no longer created; confirm nothing downstream reads
# them.
# NOTE(review): if a team appears more than once in stadium_images_df, its
# first panorama is used (the old loop skipped such teams).
image_index <- match(mlb_stadiums_table$Team, stadium_images_df$Team)
# match() pairs NA with NA; a missing team name must not pick up an image
image_index[is.na(mlb_stadiums_table$Team)] <- NA
mlb_stadiums_table$Image <- unclass(stadium_images_df$Image)[image_index]
## Warning: Unknown or uninitialised column: `Image`.
# Walk the Image column and print every entry that holds an image,
# skipping rows where none is stored
for (image_object in mlb_stadiums_table$Image) {
  if (is.null(image_object)) {
    next
  }
  print(image_object)
}
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 484 sRGB FALSE 307413 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 499 sRGB FALSE 274633 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 610 sRGB FALSE 357814 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 596 sRGB FALSE 318296 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 540 sRGB FALSE 291537 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 507 sRGB FALSE 301651 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 581 sRGB FALSE 324363 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 613 sRGB FALSE 362645 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 499 sRGB FALSE 279376 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 474 sRGB FALSE 234605 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 615 sRGB FALSE 350995 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 634 sRGB FALSE 397795 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 619 sRGB FALSE 336081 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 584 sRGB FALSE 291814 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 406 sRGB FALSE 207396 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 503 sRGB FALSE 307399 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 580 sRGB FALSE 419377 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 488 sRGB FALSE 241896 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 523 sRGB FALSE 295955 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 497 sRGB FALSE 278870 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 499 sRGB FALSE 272255 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 541 sRGB FALSE 311994 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 591 sRGB FALSE 296912 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 396 sRGB FALSE 231548 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 565 sRGB FALSE 299645 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 681 sRGB FALSE 375248 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 533 sRGB FALSE 267994 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 586 sRGB FALSE 358121 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 409 sRGB FALSE 231265 72x72
## # A tibble: 1 × 7
## format width height colorspace matte filesize density
## <chr> <int> <int> <chr> <lgl> <int> <chr>
## 1 JPEG 1600 614 sRGB FALSE 318129 72x72
Method: We first compiled each team’s winning percentage in its current stadium by calculating its record over the years. The most recently built stadium is the Texas Rangers’ Globe Life Field (opened in 2020), in which they had already won a World Series as of last year. For teams like the Chicago Cubs and Boston Red Sox, who play in stadiums more than 100 years old, I decided to begin tracking data from 1974, which is 50 years ago. I chose not to include the era of MLB before the racial barrier was broken, for obvious reasons.
Graph1: Our first analysis tries to find a correlation between home winning percentage and another variable. While multiple factors go into winning a baseball game, a team generally performs better at home than on the road. Maybe it is because the players are comfortable in the dugouts, or fist-bump the janitor before game time, or come to the park from their family homes. We show this by graphing the 2023 splits: how much higher each team’s winning percentage is at home than on the road. In fact, the graph shows that no team has a worse winning percentage at home than on the road in its current stadium. 2023 is significant for scheduling because it was the first season in which MLB scheduled all 30 teams to play each other for at least one series a year. Before that, I would see my St. Louis Cardinals play the New York Yankees once every 4 years, even though these are the two franchises with the most World Series trophies and are at the top of annual attendance as well.
Graph2: Next, we graphed the home winning % over time and ordered the teams from highest to lowest average attendance. A correlation would show up as a sloping trend in the data, but we do not see one. The team that averages the lowest attendance, the Tampa Bay Rays, has just as good a home record as the team that averages the highest, the Los Angeles Dodgers. The Rays still have their own home-field advantage, with a unique park design that includes a roof and artificial turf. There does appear to be a trend that teams with lower winning percentages average less attendance.
library(dplyr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(stringr)
# Home-field edge per team: home winning pct minus away winning pct.
# Pair the away figures with the home rows by team name when every home team
# can be found in away_win_df; otherwise fall back to positional pairing,
# which silently relies on both data frames sharing the same row order.
if ("Team" %in% names(away_win_df) &&
    all(home_win_df$Team %in% away_win_df$Team)) {
  away_pct <- away_win_df$AwayWinPct[match(home_win_df$Team, away_win_df$Team)]
} else {
  away_pct <- away_win_df$AwayWinPct
}
win_difference <- home_win_df$HomeWinPct - away_pct

# Manual override of the Phillies value (a no-op if the team is not present).
# NOTE(review): the origin of 0.086 is not visible in this file -- confirm it.
phillies_index <- which(home_win_df$Team == "Philadelphia Phillies")
win_difference[phillies_index] <- 0.086

# Build the plotting data once, after the override has been applied.
win_difference_df <- tibble(Team = home_win_df$Team, WinDifference = win_difference)

# Plot the difference as horizontal bars, smallest difference at the bottom.
ggplot(win_difference_df, aes(x = reorder(Team, WinDifference), y = WinDifference)) +
  geom_col(fill = "skyblue", color = "black") +
  coord_flip() +
  labs(
    title = "Difference in HomeWinPct and AwayWinPct by Team",
    x = "Team",
    y = "Difference (HomeWinPct - AwayWinPct)"
  ) +
  theme_minimal()
########## home win pct vs attendance
library(ggplot2)

# Attach attendance figures to every home record (all home_win_df rows are
# kept), then sort from highest to lowest average attendance and bring in
# each team's colour for the bar fill.
merged_df <- merge(
  home_win_df, attendance_df,
  by.x = "Team", by.y = "franchise", all.x = TRUE
)
merged_df <- merged_df[order(merged_df$average_attendance, decreasing = TRUE), ] %>%
  left_join(select(mlb_stadiums_table, Team, team_color), by = "Team")

# One bar per team, ordered by attendance; team_color values are used
# directly as fill colours via scale_fill_identity().
ggplot(
  merged_df,
  aes(x = reorder(Team, -average_attendance), y = HomeWinPct, fill = team_color)
) +
  geom_col() +
  scale_fill_identity() +
  labs(
    title = "Home Win Percentage vs. Team by Average Attendance",
    x = "Team (Ordered by Average Attendance)",
    y = "Home Win Percentage"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Method: According to the MLB, 47.3% of runs batted in (RBIs) in 2019 were produced via a home run. With rising strikeout rates, it is evident that hitters’ swing-for-the-fences approach has become more common as of late. I would predict this is due to rising velocity by pitchers across the board. Man, the more I talk, the more things I wish I had a plot for. Our goal is to classify which parks are the most “hitter”-friendly and which are the most “pitcher”-friendly.
Graph1: Plotting Home Runs vs. Doubles.
Plotting Home Runs vs. ParkFactor… A known X factor in hitting home runs is weather. A baseball will carry farther in the air with high humidity, altitude, and temperature. These are all factors that determine the density of the air. Note that high humidity actually means less dense air. Conversely, cold air is denser and will dampen a ball’s flight. It is estimated that every 10 degrees above 75 degrees Fahrenheit will carry a baseball 3 feet farther.
Plotting Home Runs vs. Distance to Center Field.
Plotting Home - Away ERA splits per team (negative is bad).
Finally, we give a visual of each ballpark and some of its attributes, calculated based on the work we have established.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#
A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 613 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 581 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 584 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 497 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 591 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 503 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 540 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 614 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 596 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 615 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 406 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 533 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 681 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 634 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 580 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 507 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 565 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 586 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 484 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 610 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 619 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 488 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 396 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 409 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 541 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 499 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 499 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 499 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 523 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 474 sRGB TRUE 0 72x72
What is the best way to win a trophy? Just ask Michigan, Tom Brady, or the Houston Astros organization… Gain an advantage over your opponent. In sports, “signing” is akin to sign language, where a coach uses nonverbal cues/gestures to relay information to their players about what strategy to employ on a given play. By the time the players completely understand this code, it is changed so that an opponent can’t study film to figure it out. Michigan Football won a National Championship with storylines of “sign stealing” dominating the season. The 2017 Houston Astros were found to have stolen signs a couple of years after their championship, when a former player blew the whistle. They supposedly interpreted the video feed from the center field camera to figure out a team’s next move during the game: this could include what pitch would be thrown, whether a player was stealing, and more. For the language analysis section of the project, I will not engage in “sign stealing”, but I will analyze how this hidden language affected the Houston Astros in 2017, and whether the numbers show a clear advantage. One would think that their “home” numbers would be better than their “away” numbers, because bench players would bang on trash cans to signal that an offspeed pitch was coming.
We see that the 2017 Astros did not have an exceptional home record compared to other teams in their championship years.
Among the Astros’ 4 best hitters in 2017 (including MVP Jose Altuve), only Carlos Correa had a higher Home avg. than an Away Avg. for hitting (still averaged .300 on the road, impressively). Neither the team record nor home vs. away splits have stood out to me, and I cannot statistically see how significant their cheating was. Perhaps this advantage was more momentary than long-standing.
However, the MLB community noticed something peculiar in Game 6 of the 2019 ALCS (meaning, the winner advanced to the Championship). Batter Jose Altuve was up against flamethrower Aroldis Chapman, who is notorious for reaching a record-breaking 103 mph on any given night. In the bottom of the ninth, with a tie game and two outs, Jose Altuve turns on the first pitch, which clocks in at 84 mph. The ball is blasted out of the park, and when Altuve comes home to his teammates, who are crowded around home plate, he clenches his jersey hard and yells at them not to rip it off. The imprint of a wire in his collar is exposed as he is rounding third. Fans, players, and analysts alike are led to believe that Altuve had a buzzer in his jersey which signaled that the pitch would be offspeed. While the GM and coach were fired from the team, the players were not punished, and many moved on to big contracts with different teams. In 2022, the Houston Astros won the World Series again.
library(dplyr)
library(ggplot2)
library(stringr)
# World Series winners covered by this analysis, most recent season first.
# Year[i] pairs with Team[i].
# NOTE(review): 2020 and 2021 are not listed -- confirm that is intentional.
champions <- data.frame(
  Year = c(
    2023, 2022, 2019, 2018, 2017, 2016,
    2015, 2014, 2013, 2012, 2011
  ),
  Team = c(
    "Texas Rangers",         # 2023
    "Houston Astros",        # 2022
    "Washington Nationals",  # 2019
    "Boston Red Sox",        # 2018
    "Houston Astros",        # 2017
    "Chicago Cubs",          # 2016
    "Kansas City Royals",    # 2015
    "San Francisco Giants",  # 2014
    "Boston Red Sox",        # 2013
    "San Francisco Giants",  # 2012
    "St. Louis Cardinals"    # 2011
  )
)
# Pull the win count out of a "W-L" style home record string.
#   home_record - record text whose first "-"-separated piece is the win count
# Returns the wins as a number, or 0 when that piece cannot be read as one.
extract_home_wins <- function(home_record) {
  pieces <- str_split(home_record, "-", simplify = TRUE)
  wins <- as.numeric(pieces[1])
  if (is.na(wins)) 0 else wins
}
# Home wins for each champion in its title season, in the same order as the
# rows of `champions`. An entry stays 0 whenever no home record is located.
home_wins <- numeric(nrow(champions))
# seq_len() avoids iterating over c(1, 0) should `champions` have no rows.
for (i in seq_len(nrow(champions))) {
  team_name <- champions$Team[i]
  championship_year <- champions$Year[i]
  year_row <- which(all_standings$Year == championship_year)
  # `[[` needs exactly one row; a missing or duplicated year would otherwise
  # abort the whole loop with a subscript error.
  if (length(year_row) != 1) {
    print(paste("Expected exactly one standings entry for year", championship_year,
                "but found", length(year_row)))
    next
  }
  year_standings <- all_standings[[year_row, "Standings"]]
  # The standings table is read from the first element of the stored entry;
  # an empty entry is reported instead of failing on `[[1]]`.
  if (length(year_standings) > 0 && is.data.frame(year_standings[[1]])) {
    standings_df <- year_standings[[1]]
    team_home_record <- standings_df$HOME[standings_df$TEAM == team_name]
    if (length(team_home_record) > 0) {
      home_wins[i] <- extract_home_wins(team_home_record)
    } else {
      print(paste("No home record found for team", team_name, "in year", championship_year))
    }
  } else {
    print(paste("Standings data for year", championship_year, "is not structured as expected."))
  }
}
# Attach the computed home wins and each team's colour to the champions, and
# build a "<Year> <Team>" label so repeat winners get separate bars.
champions_with_wins <- champions %>%
  mutate(HomeWins = home_wins) %>%
  inner_join(mlb_stadiums_table[c("Team", "team_color")], by = "Team") %>%
  mutate(TeamYear = paste(Year, Team))
ordered_champions <- arrange(champions_with_wins, desc(Year))

# Plotting home wins using the team color
ggplot(
  ordered_champions,
  aes(x = reorder(TeamYear, -HomeWins), y = HomeWins, fill = TeamYear)
) +
  geom_col() +
  # Map every TeamYear label to that team's colour.
  scale_fill_manual(
    values = setNames(ordered_champions$team_color, ordered_champions$TeamYear)
  ) +
  # Y-axis increments by 5
  scale_y_continuous(
    breaks = seq(0, max(ordered_champions$HomeWins, na.rm = TRUE) + 5, by = 5)
  ) +
  coord_flip() + # Flip axes for horizontal bars
  labs(
    x = "Team (Year)", y = "Home Wins",
    title = "Home Wins in Championship Season for MLB Teams"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1), # Angle the x-axis text
    legend.position = "none" # No legend required
  )
##########plotting individual hitter stats
library(ggplot2)

# Bar chart of one hitter's batting average for each BREAKDOWN value.
#   player_df   - data frame providing the BREAKDOWN and AVG columns
#                 (presumably one row per home/away split -- verify upstream)
#   player_name - name placed at the front of the plot title
# Returns the ggplot object; calling it at top level prints the plot, exactly
# as the four separate ggplot() calls it replaces did.
plot_home_away_avg <- function(player_df, player_name) {
  ggplot(player_df, aes(x = BREAKDOWN, y = AVG, fill = BREAKDOWN)) +
    geom_bar(stat = "identity", position = "dodge") +
    labs(
      title = paste0(player_name, ": Home vs. Away AVG"),
      x = "Location",
      y = "AVG",
      fill = "Location"
    ) +
    theme_minimal()
}

# Jose Altuve
plot_home_away_avg(JoseAltuve, "Jose Altuve")
# Alex Bregman
plot_home_away_avg(AlexBregman, "Alex Bregman")
# George Springer
plot_home_away_avg(GeorgeSpringer, "George Springer")
# Carlos Correa
plot_home_away_avg(CarlosCorrea, "Carlos Correa")